COVID-19 Vaccine Tweet Clustering
#pip install wordcloud
#pip install gensim
#pip install spacy
#pip install nltk
#pip install textblob
#pip install spacy
import sys
sys.executable
'C:\\Users\\zcart\\anaconda3\\python.exe'
Importing Libraries
import numpy as np
import pandas as pd
import os
import random
from sklearn.utils import shuffle
#for text cleaning
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
# clustering
from gensim.models import word2vec, KeyedVectors
from sklearn.cluster import KMeans
from sklearn.neighbors import KDTree
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import re;
import logging;
import sqlite3;
import time;
import sys;
import multiprocessing;
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
import matplotlib.pyplot as plt;
from itertools import cycle;
from tqdm import tqdm
tqdm.pandas()
Data Exploration
SEED = 2021  # fixed seed so the random sampling below is reproducible
#Importing the dataset
# NOTE(review): pandas emits a DtypeWarning for columns 5,6,7,12 (mixed types);
# consider passing explicit dtype= or low_memory=False to silence it.
data = pd.read_csv("covidvaccine.csv")
C:\Users\zcart\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3444: DtypeWarning: Columns (5,6,7,12) have mixed types.Specify dtype option on import or set low_memory=False. exec(code_obj, self.user_global_ns, self.user_ns)
data.head()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MyNewsNE | Assam | MyNewsNE a dedicated multi-lingual media house... | 24-05-2020 10:18 | 64.0 | 11.0 | 110.0 | False | 18-08-2020 12:55 | Australia to Manufacture Covid-19 Vaccine and ... | ['CovidVaccine'] | Twitter Web App | False |
| 1 | Shubham Gupta | NaN | I will tell about all experiences of my life f... | 14-08-2020 16:42 | 1.0 | 17.0 | 0.0 | False | 18-08-2020 12:55 | #CoronavirusVaccine #CoronaVaccine #CovidVacci... | ['CoronavirusVaccine', 'CoronaVaccine', 'Covid... | Twitter for Android | False |
| 2 | Journal of Infectiology | NaN | Journal of Infectiology (ISSN 2689-9981) is ac... | 14-12-2017 07:07 | 143.0 | 566.0 | 8.0 | False | 18-08-2020 12:46 | Deaths due to COVID-19 in Affected Countries\n... | NaN | Twitter Web App | False |
| 3 | Zane | NaN | Fresher than you. | 18-09-2019 11:01 | 29.0 | 25.0 | 620.0 | False | 18-08-2020 12:45 | @Team_Subhashree @subhashreesotwe @iamrajchoco... | NaN | Twitter for Android | False |
| 4 | Ann-Maree O’Connor | Adelaide, South Australia | Retired university administrator. Melburnian b... | 24-01-2013 14:53 | 83.0 | 497.0 | 10737.0 | False | 18-08-2020 12:45 | @michellegrattan @ConversationEDU This is what... | NaN | Twitter Web App | False |
random.seed(SEED)
data.iloc[random.randint(0, len(data))]
user_name Civic Medical Centre user_location Bebington user_description Civic Medical Centre (CH63) user_created 2015-04-16 10:11:54 user_followers 234.0 user_friends 70 user_favourites 83 user_verified False date 2021-02-04 12:01:15 text By prioritising the first #CovidVaccine dose, ... hashtags ['CovidVaccine'] source Hootsuite Inc. is_retweet False Name: 105960, dtype: object
Data Cleaning
# --- Text-cleaning resources, built ONCE at import time.  The original code
# rebuilt the contraction dict/regex, the stopword set and the PorterStemmer
# on every call; clean_text runs over ~207k tweets, so that was a large,
# avoidable per-call cost.
_CONTRACTION_DICT = {
    "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will",
    "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am",
    "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have",
    "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
    "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
    "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have",
    "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
    "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
    "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have",
    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is",
    "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
    "y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have"}
# Single alternation regex over all contraction keys (compiled once).
_CONTRACTION_RE = re.compile('(%s)' % '|'.join(_CONTRACTION_DICT.keys()))
_DIGIT_RE = re.compile('[0-9]+')
# Loaded/constructed once; requires the nltk 'stopwords' corpus to be downloaded.
_STOP_WORDS = set(stopwords.words('english'))
_STEMMER = PorterStemmer()

def clean_text(txt):
    """Normalise a raw tweet for clustering.

    Pipeline: expand contractions -> strip punctuation and digits ->
    tokenise -> lower-case -> drop stopwords -> drop non-alphabetic
    tokens -> Porter-stem -> re-join with single spaces.

    BUGFIX vs. the original: lower-casing now happens BEFORE stopword
    removal.  The nltk stopword list is lower-case, so the original order
    let capitalised stopwords ("The", "A", "Who") slip through the filter.
    """
    # Expand contractions ("can't" -> "cannot") via the precompiled regex.
    txt = _CONTRACTION_RE.sub(lambda m: _CONTRACTION_DICT[m.group(0)], txt)
    # Remove punctuation characters, then digit runs.
    txt = "".join(ch for ch in txt if ch not in string.punctuation)
    txt = _DIGIT_RE.sub('', txt)
    # Split into word tokens.
    words = word_tokenize(txt)
    # Lower-case first so the (lower-case) stopword list matches everything.
    words = [w.lower() for w in words]
    # Remove stopwords.
    words = [w for w in words if w not in _STOP_WORDS]
    # Remove leftover non-alphabetic tokens.
    words = [w for w in words if w.isalpha()]
    # Stem each word.
    words = [_STEMMER.stem(w) for w in words]
    return ' '.join(words)
# Keep only the text and hashtag columns; replace NaN with empty strings so the
# string concatenation below cannot fail on missing values.
data = data[['text', 'hashtags']].fillna('')
data.head()
# Merge the tweet body and its hashtag list (kept as its raw string form,
# e.g. "['CovidVaccine']") into one field to be cleaned together.
data['raw_tweet'] = data['text'] + ' ' + data['hashtags']
data.head()
| text | hashtags | raw_tweet | |
|---|---|---|---|
| 0 | Australia to Manufacture Covid-19 Vaccine and ... | ['CovidVaccine'] | Australia to Manufacture Covid-19 Vaccine and ... |
| 1 | #CoronavirusVaccine #CoronaVaccine #CovidVacci... | ['CoronavirusVaccine', 'CoronaVaccine', 'Covid... | #CoronavirusVaccine #CoronaVaccine #CovidVacci... |
| 2 | Deaths due to COVID-19 in Affected Countries\n... | Deaths due to COVID-19 in Affected Countries\n... | |
| 3 | @Team_Subhashree @subhashreesotwe @iamrajchoco... | @Team_Subhashree @subhashreesotwe @iamrajchoco... | |
| 4 | @michellegrattan @ConversationEDU This is what... | @michellegrattan @ConversationEDU This is what... |
#nltk.download()
# Clean every tweet (contraction expansion, punctuation/digit removal,
# stopword filtering, stemming); progress_apply shows a tqdm progress bar
# over the ~207k rows.
data['tweet'] = data['raw_tweet'].progress_apply(lambda txt: clean_text(txt))
data.head()
100%|██████████| 207006/207006 [03:35<00:00, 959.99it/s]
| text | hashtags | raw_tweet | tweet | |
|---|---|---|---|---|
| 0 | Australia to Manufacture Covid-19 Vaccine and ... | ['CovidVaccine'] | Australia to Manufacture Covid-19 Vaccine and ... | australia manufactur covid vaccin give citizen... |
| 1 | #CoronavirusVaccine #CoronaVaccine #CovidVacci... | ['CoronavirusVaccine', 'CoronaVaccine', 'Covid... | #CoronavirusVaccine #CoronaVaccine #CovidVacci... | coronavirusvaccin coronavaccin covidvaccin aus... |
| 2 | Deaths due to COVID-19 in Affected Countries\n... | Deaths due to COVID-19 in Affected Countries\n... | death due covid affect countri read more https... | |
| 3 | @Team_Subhashree @subhashreesotwe @iamrajchoco... | @Team_Subhashree @subhashreesotwe @iamrajchoco... | teamsubhashre subhashreesotw iamrajchoco stay ... | |
| 4 | @michellegrattan @ConversationEDU This is what... | @michellegrattan @ConversationEDU This is what... | michellegrattan conversationedu thi pass leade... |
# Sorting lexicographically puts empty cleaned tweets first for inspection.
data.sort_values(by=['tweet']).head()
# still a bunch of empty lists in the dataset. We will have to remove them before clustering.
| text | hashtags | raw_tweet | tweet | |
|---|---|---|---|---|
| 193230 | ||||
| 64305 | ||||
| 196263 | ||||
| 27430 | ||||
| 45326 |
#remove the empty strings
# astype(bool) is False for '' — keep only rows whose cleaned tweet is non-empty.
df = data[data['tweet'].astype(bool)]
df.sort_values(by=['tweet']).head()
| text | hashtags | raw_tweet | tweet | |
|---|---|---|---|---|
| 111391 | A: A good night's sleep and a run before your ... | ['CovidVaccine'] | A: A good night's sleep and a run before your ... | a a good night sleep run appt who knew covidva... |
| 129742 | ⁉️Q\n💡 A\nAbout #CovidVaccine \n(AstraZeneca/O... | ['CovidVaccine', 'COVID19'] | ⁉️Q\n💡 A\nAbout #CovidVaccine \n(AstraZeneca/O... | a about covidvaccin astrazenecaoxford do vacci... |
| 117834 | ⁉️Q\n💡 A\nAbout #CovidVaccine \n(AstraZeneca/O... | ['CovidVaccine'] | ⁉️Q\n💡 A\nAbout #CovidVaccine \n(AstraZeneca/O... | a about covidvaccin astrazenecaoxford how targ... |
| 115480 | ⁉️Q\n💡 A\nAbout #CovidVaccine \n(AstraZeneca/O... | ['CovidVaccine'] | ⁉️Q\n💡 A\nAbout #CovidVaccine \n(AstraZeneca/O... | a about covidvaccin astrazenecaoxford it possi... |
| 113448 | ⁉️Q\n💡 A\nAbout #CovidVaccine \n(AstraZeneca/O... | ['CovidVaccine'] | ⁉️Q\n💡 A\nAbout #CovidVaccine \n(AstraZeneca/O... | a about covidvaccin astrazenecaoxford vaccin s... |
# I only want to keep hashtags and tweet
# (text/raw_tweet are no longer needed once the cleaned column exists).
df = df[['hashtags', 'tweet']]
df.sort_values(by=['tweet']).head()
| hashtags | tweet | |
|---|---|---|
| 111391 | ['CovidVaccine'] | a a good night sleep run appt who knew covidva... |
| 129742 | ['CovidVaccine', 'COVID19'] | a about covidvaccin astrazenecaoxford do vacci... |
| 117834 | ['CovidVaccine'] | a about covidvaccin astrazenecaoxford how targ... |
| 115480 | ['CovidVaccine'] | a about covidvaccin astrazenecaoxford it possi... |
| 113448 | ['CovidVaccine'] | a about covidvaccin astrazenecaoxford vaccin s... |
# Tokenise each cleaned tweet on whitespace into a list of words.
df['tweet_split'] = df.tweet.str.split()
df.head()
| hashtags | tweet | tweet_split | |
|---|---|---|---|
| 0 | ['CovidVaccine'] | australia manufactur covid vaccin give citizen... | [australia, manufactur, covid, vaccin, give, c... |
| 1 | ['CoronavirusVaccine', 'CoronaVaccine', 'Covid... | coronavirusvaccin coronavaccin covidvaccin aus... | [coronavirusvaccin, coronavaccin, covidvaccin,... |
| 2 | death due covid affect countri read more https... | [death, due, covid, affect, countri, read, mor... | |
| 3 | teamsubhashre subhashreesotw iamrajchoco stay ... | [teamsubhashre, subhashreesotw, iamrajchoco, s... | |
| 4 | michellegrattan conversationedu thi pass leade... | [michellegrattan, conversationedu, thi, pass, ... |
Clustering using HashingVectorizer in sklearn
# find out the longest tweet
# NOTE(review): str.len() counts *characters*, not words — this is the longest
# tweet in characters (815), later reused as the hashing dimension.
max_len = df.tweet.str.len().max()
print(max_len)
815
# Quick demo of HashingVectorizer on a toy sentence before applying it to the corpus.
from sklearn.feature_extraction.text import HashingVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = HashingVectorizer(n_features=20)
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())
(1, 20) [[ 0. 0. 0. 0. 0. 0.33333333 0. -0.33333333 0.33333333 0. 0. 0.33333333 0. 0. 0. -0.33333333 0. 0. -0.66666667 0. ]]
# HashingVectorizer maps each tweet to a fixed-length vector in which identical
# words always hash to the same slot, so tweets become directly comparable.
# NOTE(review): n_features = max_len uses the longest tweet's *character*
# length (815) as the hash dimension — it runs, but the choice is arbitrary;
# a power of two (e.g. 1024) is the usual convention.
vectorizer = HashingVectorizer (n_features = max_len)
# NOTE(review): HashingVectorizer is stateless, so per-row fit_transform is
# equivalent to transform; a single vectorizer.transform(df['tweet']) would be
# far faster and would keep the matrix sparse instead of one dense array per row.
df['vector'] = df['tweet'].progress_apply(lambda t: vectorizer.fit_transform([t]).toarray())
df.head()
100%|██████████| 206999/206999 [00:53<00:00, 3894.50it/s]
| hashtags | tweet | tweet_split | vector | |
|---|---|---|---|---|
| 0 | ['CovidVaccine'] | australia manufactur covid vaccin give citizen... | [australia, manufactur, covid, vaccin, give, c... | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... |
| 1 | ['CoronavirusVaccine', 'CoronaVaccine', 'Covid... | coronavirusvaccin coronavaccin covidvaccin aus... | [coronavirusvaccin, coronavaccin, covidvaccin,... | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... |
| 2 | death due covid affect countri read more https... | [death, due, covid, affect, countri, read, mor... | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | |
| 3 | teamsubhashre subhashreesotw iamrajchoco stay ... | [teamsubhashre, subhashreesotw, iamrajchoco, s... | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | |
| 4 | michellegrattan conversationedu thi pass leade... | [michellegrattan, conversationedu, thi, pass, ... | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... |
# Stack the per-tweet (1, n_features) arrays into one (n_tweets, n_features) matrix.
X = np.concatenate(df['vector'].values)
# Fix random_state so the cluster assignments are reproducible across runs:
# random.seed(SEED) earlier in the notebook does NOT seed scikit-learn's own
# RNG, so the original call produced different labels on every execution.
kmeans = KMeans(n_clusters = 4, random_state = SEED)
df['cluster'] = kmeans.fit_predict(X)
PCA for visualization
# Project the high-dimensional hash vectors onto 3 principal components so the
# clusters can be drawn in 2-D (only x and y are actually used below).
pca = PCA(n_components=3)
pca_result = pca.fit_transform(X)
df['x'] = pca_result[:, 0]
df['y'] = pca_result[:, 1]
#df['z'] = pca_result[:, 2]
df.head()
| hashtags | tweet | tweet_split | vector | cluster | x | y | |
|---|---|---|---|---|---|---|---|
| 0 | ['CovidVaccine'] | australia manufactur covid vaccin give citizen... | [australia, manufactur, covid, vaccin, give, c... | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | 2 | -0.162390 | 0.242870 |
| 1 | ['CoronavirusVaccine', 'CoronaVaccine', 'Covid... | coronavirusvaccin coronavaccin covidvaccin aus... | [coronavirusvaccin, coronavaccin, covidvaccin,... | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | 3 | -0.270168 | -0.069966 |
| 2 | death due covid affect countri read more https... | [death, due, covid, affect, countri, read, mor... | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | 0 | 0.276784 | 0.099291 | |
| 3 | teamsubhashre subhashreesotw iamrajchoco stay ... | [teamsubhashre, subhashreesotw, iamrajchoco, s... | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | 0 | 0.234544 | -0.186993 | |
| 4 | michellegrattan conversationedu thi pass leade... | [michellegrattan, conversationedu, thi, pass, ... | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | 0 | 0.235818 | -0.189325 |
# One hex colour per cluster id (supports up to 22 clusters; only 4 are used).
cluster_colors = np.array(['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000'])
df['color'] = cluster_colors[df.cluster.values]
# Truncate each tweet to 50 chars for the hover tooltip.
df['text'] = df.tweet.str[:50]
import bokeh.io
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# from bokeh.charts import Donut, HeatMap, Histogram, Line, Scatter, show, output_notebook, output_file
bokeh.io.output_notebook()
#visualize the data using bokeh
#output_file("top_artists.html", title="top artists")
# TOOLS = "pan,wheel_zoom,box_zoom,reset,hover,previewsave"
# Only the columns the plot needs go into the data source.
source = ColumnDataSource.from_df(df[['x', 'y', 'color', 'text']])
TOOLTIPS = [("text", "@text")]
TOOLS = "pan,wheel_zoom,box_zoom,reset,hover,save"
plot = figure(plot_width=800, plot_height=450, tooltips=TOOLTIPS, tools=TOOLS)
#draw circles
# One circle per tweet at its PCA (x, y) position, coloured by cluster.
plot.circle(y='y', x='x', source=source, size=15, fill_color='color')
show(plot)